#!/usr/bin/env python

import nltk
from nltk.corpus import gutenberg 

# Load the word sequence for 'austen-emma.txt' from the Gutenberg corpus
# and report how many items it contains.
emma = gutenberg.words('austen-emma.txt')
emma_word_count = len(emma)
print(emma_word_count)

# For every file in the Gutenberg corpus, print one line of statistics:
#   <raw text length> <word count> <case-folded vocabulary size> <sentence count> <file id>
all_file = gutenberg.fileids()
for fileid in all_file:
    num_chars = len(gutenberg.raw(fileid))
    words = gutenberg.words(fileid)
    num_words = len(words)
    num_sents = len(gutenberg.sents(fileid))
    # Call lower() (the original collected the bound method objects instead
    # of strings) so words differing only in case count as one entry.
    num_vocab = len({w.lower() for w in words})
    # Format first and pass a single string to print(): this produces the
    # same space-separated line under both Python 2 and Python 3.
    print("%d %d %d %d %s" % (num_chars, num_words, num_vocab, num_sents, fileid))
